This notebook explores key trends and patterns in COVID-19 incidence and vaccination data. It analyzes infection rates by age groups, identifying which demographics experienced the highest infection rates, and tracks changes in infection and mortality rates over time to assess the impact of vaccination campaigns. Vaccination uptake is examined across demographics, highlighting gender-based differences and prioritization of high-risk groups, such as older adults. Additionally, the notebook studies vaccine dosage and preparation efficiency, identifying trends and shifts in vaccine administration over time. The effectiveness of vaccination facilities is also evaluated by exploring differences in COVID-19 case reductions across regions served by specific facilities. Hypothetical herd immunity thresholds are modeled to estimate the level of immunity achieved through vaccination and natural recovery. Finally, the notebook applies deep learning methods, including RNN, LSTM, BiLSTM, and GRU models, to forecast COVID-19 incidence data, providing valuable predictions for understanding the potential future trajectory of the pandemic.
# @title Preliminary summary stats of datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the two raw exports: vaccination records (Excel) and the COVID-19
# morbidity series (semicolon-separated CSV).
vaccination = pd.read_excel('/content/drive/MyDrive/vaccination_data.xlsx')
covid = pd.read_csv('/content/drive/MyDrive/covid_19_morbidity_data.csv', sep=';')

# Latvian source header -> English snake_case name, vaccination dataset.
vaccination_col_mapping = {
    'Vakcinācijas iestādes kods': 'vaccination_facility_code',
    'Vakcinācijas iestādes nosaukums': 'vaccination_facility_name',
    'Vakcinācijas datums': 'vaccination_date',
    'Vakcīnas veids': 'vaccine_type',
    'Preparāts': 'preparation',
    'Vakcinācijas posms': 'vaccination_stage',
    'Vakcīnas kārtas numurs': 'vaccine_series_number',
    'Preparāta daudzums ml': 'preparation_amount_ml',
    'Vakcīnas ievadīšanas veids': 'vaccine_administration_method',
    'Indikācijas vakcinācijai': 'vaccination_indications',
    'Vakcinētās personas vecums': 'vaccinated_person_age',
    'Vakcinētās personas dzimums': 'vaccinated_person_gender',
    'Vakcinēto personu skaits': 'number_of_vaccinated_persons',
}

# Latvian source header -> English snake_case name, COVID-19 morbidity dataset.
# NOTE(review): the English names chosen for the '...VakcNepab' / '...Nevakc'
# columns look only approximate - confirm against the data dictionary before
# interpreting vaccinated/unvaccinated splits.
covid_col_mapping = {
    'Datums': 'date',
    'TestuSkaits': 'number_of_tests',
    'ApstiprinataCOVID19InfekcijaSkaits': 'confirmed_covid19_cases',
    'Ipatsvars': 'proportion',
    'IzarstetoPacientuSkaits': 'number_of_recovered_patients',
    'MirusoPersonuSkaits': 'number_of_deaths',
    'ApstiprinatiVecGr_0-9Gadi': 'confirmed_age_group_0_9_years',
    'ApstiprinatiVecGr_10-19Gadi': 'confirmed_age_group_10_19_years',
    'ApstiprinatiVecGr_20-29Gadi': 'confirmed_age_group_20_29_years',
    'ApstiprinatiVecGr_30-39Gadi': 'confirmed_age_group_30_39_years',
    'ApstiprinatiVecGr_40-49Gadi': 'confirmed_age_group_40_49_years',
    'ApstiprinatiVecGr_50-59Gadi': 'confirmed_age_group_50_59_years',
    'ApstiprinatiVecGr_60-69Gadi': 'confirmed_age_group_60_69_years',
    'ApstiprinatiVecGr_70GadiUnVairak': 'confirmed_age_group_70_and_older',
    'ApstiprinatiVecGr_70-79Gadi': 'confirmed_age_group_70_79_years',
    'ApstiprinatiVecGr_80GadiUnVairak': 'confirmed_age_group_80_and_older',
    'IzveselojusosSkaits': 'number_of_recovered_cases',
    '14DienuKumulativaSaslimstibaUz100000Iedzivotaju': '14_day_cumulative_infection_rate_per_100000_inhabitants',
    'ApstCOVID19InfSk_NevakcVakcNepab': 'confirmed_covid19_cases_unvaccinated',
    'ApstCOVID19InfSk_Vakc': 'confirmed_covid19_cases_vaccinated',
    'ApstCOVID19InfSk_VakcNepab': 'confirmed_covid19_cases_vaccinated_unvaccinated',
    'ApstCOVID19InfSk_Nevakc': 'confirmed_covid19_cases_unvaccinated_total',
    'MirusoPersonuSkaits_NevakcVakcNepab': 'number_of_deaths_unvaccinated',
    'MirusoPersonuSkaits_Vakc': 'number_of_deaths_vaccinated',
}

# Apply the translations; headers that are not in a mapping keep their name.
vaccination = vaccination.rename(columns=vaccination_col_mapping)
covid = covid.rename(columns=covid_col_mapping)
# datasets = {'vaccination': vaccination,
# 'covid': covid}
# for dataset in datasets:
# # print the dataset name
# print(f"\nDataset: {dataset}")
# # print the size of the dataset
# print(f"Size: {datasets[dataset].shape}")
# # print the table header
# print(f"{'Column Name':<60} {'# Unique Values':<20} {'# Missing Values':<20} {'Data Type':<15}")
# print("-" * 115)
# # loop through the columns and print details
# for col in datasets[dataset].columns:
# n_unique = datasets[dataset][col].nunique()
# missing_values = datasets[dataset][col].isnull().sum()
# data_type = str(datasets[dataset][col].dtype)
# print(f"{col:<60} {n_unique:<20} {missing_values:<20} {data_type:<15}")
# --- COVID-19 incidence data: type cleaning and study window ---------------
# Parse the reporting date into a proper datetime column.
covid['date'] = pd.to_datetime(covid['date'])

# Clean the measurement columns only. The date column is deliberately
# excluded: substituting 0 for a missing date would mix integers into the
# datetime column and break the date filter below.
value_columns = covid.columns.drop('date')
for col in value_columns:
    # '...' / '…' are used in the source as missing-value placeholders;
    # treat them, and genuinely empty cells, as 0.
    covid[col] = covid[col].replace(['...', '…'], 0).fillna(0)
    # Columns that were read as text become numeric.
    if covid[col].dtype == 'object':
        covid[col] = covid[col].astype('float64')

# Restrict the analysis to data up to the end of 2022:
#   * after 2022 there is no breakdown by age group,
#   * recovered cases are not reported after 2023-06,
#   * COVID-19 incidence drops significantly.
# .copy() gives an independent frame so later column assignments do not act
# on a view of the unfiltered data.
covid = covid[covid['date'] <= '2022-12-31'].copy()
# --- Vaccination data: types and derived calendar / age columns ------------
# Parse the vaccination date into a proper datetime column.
vaccination['vaccination_date'] = pd.to_datetime(vaccination['vaccination_date'])

# Encode gender as a number: 'V' -> 0 (men), 'S' -> 1 (women).
vaccination['vaccinated_person_gender'] = vaccination['vaccinated_person_gender'].replace({'V': 0, 'S': 1})
vaccination['vaccinated_person_gender'] = pd.to_numeric(vaccination['vaccinated_person_gender'], downcast='integer')

# Calendar month of the vaccination as a 'YYYY-MM' string.
vaccination['month'] = vaccination['vaccination_date'].dt.strftime('%Y-%m')

# Calendar week of the vaccination, labelled 'YYYY-MM-DD - YYYY-MM-DD'.
# dayofweek is 0 on Monday, so subtracting it lands on the week's Monday.
vaccination['week_start'] = vaccination['vaccination_date'] - pd.to_timedelta(vaccination['vaccination_date'].dt.dayofweek, unit='d')
vaccination['week_end'] = vaccination['week_start'] + pd.Timedelta(days=6)
vaccination['week'] = vaccination['week_start'].dt.strftime('%Y-%m-%d') + ' - ' + vaccination['week_end'].dt.strftime('%Y-%m-%d')

# Age bands. pd.cut builds right-closed intervals, so without
# include_lowest=True an age of exactly 0 falls outside the first bin and
# becomes NaN; likewise a finite top edge of 100 would drop anyone older.
# The open-ended upper edge keeps every age of 70 or more in '70+'.
vaccination['age_category'] = pd.cut(
    vaccination['vaccinated_person_age'],
    bins=[0, 19, 29, 39, 49, 59, 69, float('inf')],
    labels=['0-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+'],
    include_lowest=True,
)
# @title Covid incidence data
# Derive cumulative totals and rates on an independent copy of the frame.
# 'number_of_recovered_cases' is already a running total in the source (there
# appears to be a break in the series where it jumps from about 1.2K to 13K),
# so it is only renamed, not cumulated.
covid = (
    covid.copy()
    .rename(columns={'number_of_recovered_cases': 'cumulative_recovered_cases'})
    .assign(
        # running total of confirmed cases
        cumulative_confirmed_covid19_cases=lambda d: d['confirmed_covid19_cases'].cumsum(),
        # running total of deaths
        cumulative_number_of_deaths=lambda d: d['number_of_deaths'].cumsum(),
        # confirmed minus recovered minus deceased
        active_cases=lambda d: (
            d['cumulative_confirmed_covid19_cases']
            - d['cumulative_recovered_cases']
            - d['cumulative_number_of_deaths']
        ),
        # running total of tests
        cumulative_num_tests=lambda d: d['number_of_tests'].cumsum(),
        # morbidity rate: cumulative cases as a percentage of cumulative tests
        morbidity_rate=lambda d: d['cumulative_confirmed_covid19_cases'] / d['cumulative_num_tests'] * 100,
        # mortality rate: cumulative deaths as a percentage of cumulative cases
        mortality_rate=lambda d: d['cumulative_number_of_deaths'] / d['cumulative_confirmed_covid19_cases'] * 100,
        # daily recoveries: day-over-day change of the running total
        # (the first row has no predecessor and is NaN)
        recovered_cases=lambda d: d['cumulative_recovered_cases'].diff(),
    )
)
# @title Setting style for charts
import warnings
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import logging
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Only let matplotlib log errors (hides font-lookup and similar messages).
logging.getLogger('matplotlib').setLevel(logging.ERROR)

# Global chart typography: serif family, with Liberation Serif as the only
# font listed for it.
mpl.rcParams["font.family"] = "serif"
mpl.rcParams["font.serif"] = ["Liberation Serif"]
mpl.rcParams["font.weight"] = "normal"
# Title, axis-label and tick-label sizes.
mpl.rcParams["axes.titlesize"] = 16
mpl.rcParams["axes.labelsize"] = 12
mpl.rcParams["xtick.labelsize"] = 10
mpl.rcParams["ytick.labelsize"] = 10

# Ten evenly spaced colours sampled across the Viridis colormap.
viridis_colors = plt.cm.viridis(np.linspace(0, 1, 10))
COVID-19 Incidence by Age Groups
Analyze infection rates across various age groups to identify which demographics experienced the highest rates of infection.
Examine temporal trends to evaluate whether vaccination efforts had a noticeable impact on infection rates within specific age brackets.
# @title Monthly Covid Incidence by Age
import pandas as pd
import matplotlib.pyplot as plt
# Make sure the reporting date is a datetime (no-op when it already is).
covid['date'] = pd.to_datetime(covid['date'])

# Source columns holding confirmed cases per age band.
age_columns = [
    'confirmed_age_group_0_9_years', 'confirmed_age_group_10_19_years',
    'confirmed_age_group_20_29_years', 'confirmed_age_group_30_39_years',
    'confirmed_age_group_40_49_years', 'confirmed_age_group_50_59_years',
    'confirmed_age_group_60_69_years', 'confirmed_age_group_70_and_older',
]
# Short display label for each of those columns, in the same order.
age_labels = dict(zip(
    age_columns,
    ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+'],
))
covid = covid.rename(columns=age_labels)

# Sum daily counts per calendar month so the chart stays readable.
covid['month'] = covid['date'].dt.to_period('M')
covid_monthly = covid.groupby('month')[list(age_labels.values())].sum()

# Months marked on the chart as the start and end of the vaccination period.
vaccination_start = pd.Period('2020-12', freq='M')
vaccination_end = pd.Period('2021-09', freq='M')

# Stacked bars: one bar per month, one segment per age band.
fig, ax = plt.subplots(figsize=(12, 6))
covid_monthly.plot(kind='bar', stacked=True, ax=ax, colormap="viridis", width=0.8)

ax.set_title('Monthly COVID-19 Incidence by Age Group')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Confirmed Cases')
ax.legend(title="Age Groups", bbox_to_anchor=(1.05, 1), loc='upper left')

# The marker position is the number of months between the first plotted
# month and the marker month (bars are drawn at 0, 1, 2, ...).
start_x = (vaccination_start - covid_monthly.index[0]).n
end_x = (vaccination_end - covid_monthly.index[0]).n

# Dashed grey markers, each with a rotated caption at half the y-axis height.
ax.axvline(x=start_x, color='grey', linestyle='--', linewidth=1)
ax.axvline(x=end_x, color='grey', linestyle='--', linewidth=1)
y_middle = ax.get_ylim()[1] / 2
ax.text(start_x, y_middle, 'Vaccination Start\n2020-12', color='grey', ha='center', rotation=90)
ax.text(end_x, y_middle, 'Vaccination End\n2021-09', color='grey', ha='center', rotation=90)

# Keep only the bottom spine, with tick marks on the bottom and left.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.tick_params(axis='x', bottom=True, top=False)
ax.tick_params(axis='y', left=True, right=False)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# @title COVID-19 Infection Rates by Age Group Over Time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Monthly infection rate per 100,000 inhabitants for each age band, drawn as
# one line per band with the vaccination period marked.

# Make sure the reporting date is a datetime (no-op when it already is).
covid['date'] = pd.to_datetime(covid['date'])

# Source columns holding confirmed cases per age band.
age_columns = [
    'confirmed_age_group_0_9_years', 'confirmed_age_group_10_19_years',
    'confirmed_age_group_20_29_years', 'confirmed_age_group_30_39_years',
    'confirmed_age_group_40_49_years', 'confirmed_age_group_50_59_years',
    'confirmed_age_group_60_69_years', 'confirmed_age_group_70_and_older',
]
# Short display label for each of those columns.
age_labels = {
    'confirmed_age_group_0_9_years': '0-9',
    'confirmed_age_group_10_19_years': '10-19',
    'confirmed_age_group_20_29_years': '20-29',
    'confirmed_age_group_30_39_years': '30-39',
    'confirmed_age_group_40_49_years': '40-49',
    'confirmed_age_group_50_59_years': '50-59',
    'confirmed_age_group_60_69_years': '60-69',
    'confirmed_age_group_70_and_older': '70+',
}
covid = covid.rename(columns=age_labels)

# Sum daily counts per calendar month.
covid['month'] = covid['date'].dt.to_period('M')
covid_monthly = covid.groupby('month')[list(age_labels.values())].sum()

# Population estimates for 2022 per age band.
population_estimates = {
    '0-9': 200109,
    '10-19': 192730,
    '20-29': 180063,
    '30-39': 268339,
    '40-49': 250110,
    '50-59': 257933,
    '60-69': 247103,
    '70+': 279370,
}

# Divide each column by the population carrying the SAME label. Looking the
# populations up by column name (instead of relying on the dict's value
# order) keeps the rates correct if either ordering changes, and raises a
# KeyError if a plotted band has no population estimate.
population = pd.Series(population_estimates).loc[covid_monthly.columns]
infection_rates = covid_monthly.div(population, axis=1) * 100000

# Months marked on the chart as the start and end of the vaccination period.
vaccination_start = pd.Period('2020-12', freq='M')
vaccination_end = pd.Period('2021-09', freq='M')

fig, ax = plt.subplots(figsize=(12, 6))

# One line per age band, coloured along the Viridis colormap.
month_timestamps = infection_rates.index.to_timestamp()
colors = plt.cm.viridis(np.linspace(0, 1, len(infection_rates.columns)))
for i, age_group in enumerate(infection_rates.columns):
    ax.plot(month_timestamps, infection_rates[age_group], label=age_group, color=colors[i])

# The x-axis is a real time axis, so the markers are placed at the marker
# months' own timestamps. This does not depend on where the data starts or on
# every month being present (indexing by month offset would wrap around or
# fail when the series starts after the marker month).
start_ts = vaccination_start.to_timestamp()
end_ts = vaccination_end.to_timestamp()
ax.axvline(x=start_ts, color='grey', linestyle='--', linewidth=1)
ax.axvline(x=end_ts, color='grey', linestyle='--', linewidth=1)

# Rotated captions at half the y-axis height.
y_middle = ax.get_ylim()[1] / 2
ax.text(start_ts, y_middle, 'Vaccination Start\n2020-12', color='grey', ha='center', rotation=90)
ax.text(end_ts, y_middle, 'Vaccination End\n2021-09', color='grey', ha='center', rotation=90)

ax.set_title('COVID-19 Infection Rates by Age Group Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Infection Rate (per 100,000 population)')
ax.legend(title="Age Groups", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)

# Keep only the bottom spine, with tick marks on the bottom and left.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.tick_params(axis='x', bottom=True, top=False)
ax.tick_params(axis='y', left=True, right=False)

plt.tight_layout()
plt.show()
# @title COVID-19 Infection Rates (per 1,000 population) by Age Group and Month
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Heatmap of monthly infection rates per 1,000 inhabitants: one row per age
# band, one column per month, coloured relative to the other bands that month.

# Make sure the reporting date is a datetime (no-op when it already is).
covid['date'] = pd.to_datetime(covid['date'])

# Source columns holding confirmed cases per age band.
age_columns = [
    'confirmed_age_group_0_9_years', 'confirmed_age_group_10_19_years',
    'confirmed_age_group_20_29_years', 'confirmed_age_group_30_39_years',
    'confirmed_age_group_40_49_years', 'confirmed_age_group_50_59_years',
    'confirmed_age_group_60_69_years', 'confirmed_age_group_70_and_older',
]
# Short display label for each of those columns.
age_labels = {
    'confirmed_age_group_0_9_years': '0-9',
    'confirmed_age_group_10_19_years': '10-19',
    'confirmed_age_group_20_29_years': '20-29',
    'confirmed_age_group_30_39_years': '30-39',
    'confirmed_age_group_40_49_years': '40-49',
    'confirmed_age_group_50_59_years': '50-59',
    'confirmed_age_group_60_69_years': '60-69',
    'confirmed_age_group_70_and_older': '70+',
}
covid = covid.rename(columns=age_labels)

# Sum daily counts per calendar month.
covid['month'] = covid['date'].dt.to_period('M')
covid_monthly = covid.groupby('month')[list(age_labels.values())].sum()

# Population estimates for 2022 per age band.
population_estimates = {
    '0-9': 200109,
    '10-19': 192730,
    '20-29': 180063,
    '30-39': 268339,
    '40-49': 250110,
    '50-59': 257933,
    '60-69': 247103,
    '70+': 279370,
}

# Divide each column by the population carrying the SAME label (looked up by
# name rather than by dict value order; a missing band raises KeyError).
population = pd.Series(population_estimates).loc[covid_monthly.columns]
infection_rates = covid_monthly.div(population, axis=1) * 1000

# Reshape to age band (rows) x month (columns).
infection_rates = infection_rates.reset_index()
heatmap_data = infection_rates.melt(id_vars='month', var_name='Age Group', value_name='Infection Rate')
heatmap_pivot = heatmap_data.pivot(index='Age Group', columns='month', values='Infection Rate')
heatmap_pivot = heatmap_pivot.fillna(0)

# Min-max scale every month (column) on its own, so the colour shows which
# age bands were hit hardest within that month.
normed_heatmap = (heatmap_pivot - heatmap_pivot.min()) / (heatmap_pivot.max() - heatmap_pivot.min())

# Show October 2020 onwards; the same slice is applied to the raw rates so
# the annotations line up with the colours.
normed_heatmap = normed_heatmap.loc[:, '2020-10':'2024-09']  # Adjust according to your data's last month
heatmap_pivot_filtered = heatmap_pivot.loc[:, '2020-10':'2024-09']

plt.figure(figsize=(12, 6))

# Colours come from the normalised rates; the cell text shows the actual rate
# truncated to a whole number.
sns.heatmap(
    normed_heatmap,
    cmap='viridis',
    cbar_kws={'label': 'Normalized Infection Rate', 'shrink': 0.5},
    annot=heatmap_pivot_filtered.astype(int),
    fmt='d',
    linewidths=0.5,
    linecolor='white',
    square=True,
)

# Heatmap cell i spans x in [i, i + 1], so its label belongs at i + 0.5;
# ticks at whole numbers would sit on the cell borders.
plt.xticks(
    ticks=[position + 0.5 for position in range(len(normed_heatmap.columns))],
    labels=[str(month)[:7] for month in normed_heatmap.columns],
    rotation=45,
)

plt.title('COVID-19 Infection Rates (per 1,000 population) by Age Group and Month')
plt.xlabel('Month')
plt.ylabel('Age Group')

plt.tight_layout()
plt.show()
Number of Vaccinated Persons by Vaccination Indications and Month
This analysis explores the distribution of vaccinated individuals across different priority groups over time. It highlights which groups received the highest vaccination coverage and how vaccination efforts were prioritized during different months. The results provide insights into vaccination patterns and trends, offering a clearer understanding of how various demographic and high-risk groups were targeted throughout the vaccination campaign.
# @title Number of Vaccinated Persons by Vaccination Priority Group and Month
import matplotlib.pyplot as plt
import seaborn as sns
# Latvian vaccination indication -> English label.
translation_dict = {
    "Persona ar hroniskām slimībām": "Person with chronic illnesses",
    "Persona vecumā virs 60 gadiem": "Person aged over 60 years",
    "Ārstniecības persona": "Medical professional",
    "Cita paaugstinātā riska grupa": "Other high-risk group",
    "Izglītības iestāžu darbinieks": "Educational institution worker",
    "Cits iedzīvotājs": "Other resident",
    "Ārstniecības iestādes darbinieks": "Healthcare facility worker",
    "Kontakts ar personām ar hroniskām slimībām": "Contact with chronic patients",
    "Operatīvo dienestu darbinieks": "Emergency services worker",
    "Citas veselības indikācijas": "Other health indications",
    "Pēc paša vēlēšanās": "By personal choice",
    "Nozaru prioritāro iestāžu darbinieks": "Sector priority institution worker",
    "Grūtniece": "Pregnant person",
    "Vakcinēts ārzemēs": "Vaccinated abroad",
    "Plānveida vakcinācija (pēc vakcinācijas kalendāra)": "Scheduled vaccination (per vaccination calendar)",
    "Imūnsupresīva persona": "Immunosuppressed person",
    "SAC darbinieks": "SAC worker",
    "Ieslodzījumu vietu pārvaldes personāls": "Prison staff",
    "SAC klients": "SAC client",
    "Ieslodzītais": "Prisoner",
    "Epidemioloģiskās indikācijas - kontakts ar infekcijas slimnieku": "Epidemiological indications - contact with infectious person",
    "Speciālo iestāžu klients": "Special institution client",
    "Epidemioloģiskās indikācijas - uzliesmojums, epidēmija": "Epidemiological indications - outbreak, epidemic",
    "Pirms ceļojuma": "Before travel",
    "Arodinfekciju profilakse": "Occupational infection prevention",
    "Veselības indikācijas - trauma, zarnu trakta operācija": "Health indications - trauma, gastrointestinal surgery",
}

# Vaccinated persons per (indication, month), ordered by month and then by
# descending count within the month.
grouped = (
    vaccination
    .groupby(['vaccination_indications', 'month'])['number_of_vaccinated_persons']
    .sum()
    .reset_index()
    .sort_values(by=['month', 'number_of_vaccinated_persons'], ascending=[True, False])
)

# Show English labels; indications missing from the dictionary stay as they are.
grouped['vaccination_indications'] = grouped['vaccination_indications'].replace(translation_dict)

# Row order for the chart: largest overall total first.
indication_order = (
    grouped
    .groupby("vaccination_indications")['number_of_vaccinated_persons']
    .sum()
    .sort_values(ascending=False)
    .index
)

# Indication (rows) x month (columns) matrix, rows in the order above.
heatmap_data = grouped.pivot(
    index="vaccination_indications",
    columns="month",
    values="number_of_vaccinated_persons",
).reindex(indication_order)

# Annotated heatmap of the counts.
plt.figure(figsize=(12, 10))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".0f",
    cmap="YlGnBu",
    cbar_kws={'label': 'Number of Vaccinated Persons'},
)
plt.title("Number of Vaccinated Persons by Vaccination Indications and Month")
plt.xlabel("Month")
plt.ylabel("Vaccination Indications")
plt.xticks(rotation=45)
plt.show()
Vaccination Uptake by Demographics
Analyze vaccination patterns in high-risk groups, such as older adults and individuals with chronic conditions, to assess prioritization strategies.
# @title Monthly Vaccinations by Priority Group for Top Vaccination Indications by Age Group
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Stacked monthly bars of the five largest vaccination indications, one
# subplot per age group. Uses the prepared 'vaccination' frame and the
# 'translation_dict' defined in the indications heatmap cell.

# Step 1: vaccinated persons per (indication, month, age group).
grouped = (
    vaccination
    .groupby(['vaccination_indications', 'month', 'age_category'], observed=True)['number_of_vaccinated_persons']
    .sum()
    .reset_index()
)
# Show English labels for the indications.
grouped['vaccination_indications'] = grouped['vaccination_indications'].replace(translation_dict)

# Step 2: the five indications with the largest overall totals.
top_indications = grouped.groupby('vaccination_indications')['number_of_vaccinated_persons'].sum().nlargest(5).index

# Step 3: keep only those indications (copy so the subset is independent).
filtered_data = grouped[grouped['vaccination_indications'].isin(top_indications)].copy()

# Step 4: one subplot per age group, in ascending order.
age_groups = sorted(filtered_data['age_category'].unique())
num_age_groups = len(age_groups)

plt.figure(figsize=(12, 16))
palette = sns.color_palette("viridis", len(top_indications))
# Fixed indication -> colour mapping. Colouring by position inside each
# subplot would give the same indication different colours whenever an age
# group lacks one of the top indications, making the panels incomparable.
indication_colors = dict(zip(top_indications, palette))

for i, age_group in enumerate(age_groups):
    plt.subplot((num_age_groups + 1) // 2, 2, i + 1)  # two-column grid
    age_data = filtered_data[filtered_data['age_category'] == age_group]

    # Month (rows) x indication (columns) matrix for stacked bars.
    pivot_data = age_data.pivot(index='month', columns='vaccination_indications', values='number_of_vaccinated_persons').fillna(0)

    # Keep the global top-5 order, skipping indications absent for this group.
    relevant_indications = [ind for ind in top_indications if ind in pivot_data.columns]
    pivot_data = pivot_data[relevant_indications]

    pivot_data.plot(
        kind='bar',
        stacked=True,
        ax=plt.gca(),
        color=[indication_colors[ind] for ind in relevant_indications],
    )

    plt.title(f"Monthly Vaccinations by Priority Group for Age Group: {age_group}")
    plt.xlabel("Month")
    plt.ylabel("No. of Vaccinated Persons")  # abbreviated to fit the panel
    plt.xticks(rotation=45)
    plt.legend(title="Priority Group", loc='upper left', fontsize='small')

plt.tight_layout()
plt.suptitle("Monthly Vaccinations for Top Vaccination Priority Groups by Age Group", y=1.02)
plt.show()
# @title Cumulative Vaccination Counts Over Time by Age Group for Each Vaccination Stage
import matplotlib.pyplot as plt
import pandas as pd
# Stacked area charts of cumulative vaccinations per age group, one chart per
# vaccination stage (first / second / third dose), December 2020 to
# September 2021.

# Population estimates with the 0-9 and 10-19 bands merged into '0-19'
# (kept for reference; not used in the calculations of this cell).
population_estimates = {
    '0-19': 200109 + 192730,
    '20-29': 180063,
    '30-39': 268339,
    '40-49': 250110,
    '50-59': 257933,
    '60-69': 247103,
    '70+': 279370,
}

# Month of each vaccination as a timestamp on the first day of the month.
# It is derived from 'vaccination_date' rather than re-parsing the existing
# 'month' column, whose type (string, datetime or Period) depends on which
# notebook cell ran last.
vaccination['month'] = vaccination['vaccination_date'].dt.to_period('M').dt.to_timestamp()

# Latvian stage code -> English label. Stages without a translation (for
# example "1.balstvakcinācija") are not plotted.
stage_translations = {
    '1.pote': 'First Dose',
    '2.pote': 'Second Dose',
    '3.pote': 'Third Dose',
}

# Stages present in the data, in dose order (first, second, third) rather
# than in order of first appearance in the file.
present_stages = set(vaccination['vaccination_stage'].unique())
stages = [stage for stage in stage_translations if stage in present_stages]
num_stages = len(stages)

# Common x-axis range for all charts.
common_start_date = pd.to_datetime("2020-12")
common_end_date = pd.to_datetime("2021-09")

# One chart per row. squeeze=False always returns an array of axes, so
# axes[i] also works when only a single stage is present.
fig, axes = plt.subplots(num_stages, 1, figsize=(14, num_stages * 4), sharex=False, squeeze=False)
axes = axes.ravel()

for i, stage in enumerate(stages):
    stage_ax = axes[i]

    # Rows of this stage inside the plotted date range (both ends inclusive).
    stage_data = vaccination[(vaccination['vaccination_stage'] == stage) &
                             (vaccination['month'] >= common_start_date) &
                             (vaccination['month'] <= common_end_date)]
    if stage_data.empty:
        print(f"No data available for stage {stage} in the specified date range.")
        stage_ax.set_visible(False)  # do not leave an empty frame behind
        continue

    # Monthly vaccinations per age group as a month x age-group matrix;
    # months without vaccinations for a group count as 0.
    stage_summary = stage_data.groupby(['age_category', 'month'], observed=True)['number_of_vaccinated_persons'].sum().reset_index()
    monthly_counts = stage_summary.pivot(index='month', columns='age_category', values='number_of_vaccinated_persons').fillna(0)

    # Cumulate AFTER filling the gaps, so a month without vaccinations keeps
    # the previous running total instead of dropping back to 0.
    pivot_data = monthly_counts.cumsum()

    pivot_data.plot(kind='area', stacked=True, colormap='viridis', alpha=0.7, ax=stage_ax)

    stage_ax.set_title(f"Cumulative Vaccination Counts for {stage_translations[stage]} Over Time by Age Group")
    stage_ax.set_ylabel("Cumulative Vaccinations")
    stage_ax.set_xlabel("Month")
    stage_ax.set_xlim([common_start_date, common_end_date])
    stage_ax.legend(title="Age Group", bbox_to_anchor=(1.05, 1), loc='upper left')
    stage_ax.tick_params(axis='x', rotation=0)  # horizontal labels read better here

# Overall title
plt.suptitle("Cumulative Vaccination Counts Over Time by Age Group for Each Vaccination Stage", y=1.02)
plt.tight_layout(rect=[0, 0.04, 1, 0.96])
plt.show()
Monthly Vaccination Trends by Gender
This analysis explores vaccination patterns over time for the 1st and 2nd doses, focusing on differences between male and female populations. It examines vaccination uptake trends and tracks cumulative coverage percentages to understand the progress toward immunization goals. By analyzing these patterns, we gain insights into gender-specific vaccination dynamics and assess how effectively different segments of the population were reached during the vaccination campaign.
# @title Monthly Vaccination Counts by Gender for 1st and 2nd Dose (with Cumulative Coverage in %)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Population per gender code, used as the denominator for cumulative coverage.
population_estimates = {
    0: 869057,   # male population
    1: 1006700,  # female population
}

# Calendar month of each vaccination as a monthly Period.
vaccination['month'] = pd.to_datetime(vaccination['vaccination_date'], errors='coerce').dt.to_period('M')

# Separate frames for first-dose ('1.pote') and second-dose ('2.pote') records.
stage_column = vaccination['vaccination_stage']
first_dose_data = vaccination[stage_column == '1.pote']
second_dose_data = vaccination[stage_column == '2.pote']
def prepare_data(dose_data, population_estimates):
    """Build the gender x month count matrix and its heatmap annotations.

    Parameters
    ----------
    dose_data : pandas.DataFrame
        Vaccination records for one dose. Reads the columns 'month',
        'vaccinated_person_gender' and 'number_of_vaccinated_persons'.
    population_estimates : mapping
        Population size for every gender code found in ``dose_data``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``heatmap_data``: vaccinated persons per gender (rows) and month
        (columns), 0 where a gender has no records in a month.
        ``annotations``: same shape, each cell the text
        ``"<count>\\n(<cumulative coverage>%)"`` with the coverage rounded to
        two decimals.

    Raises
    ------
    KeyError
        If a gender code in the data has no entry in ``population_estimates``.
    """
    # Vaccinated persons per (month, gender).
    monthly_counts = (
        dose_data
        .groupby(['month', 'vaccinated_person_gender'])['number_of_vaccinated_persons']
        .sum()
        .reset_index()
    )

    # Gender x month matrix; a month without records for a gender counts as 0.
    heatmap_data = monthly_counts.pivot(
        index='vaccinated_person_gender',
        columns='month',
        values='number_of_vaccinated_persons',
    ).fillna(0)

    # Running total along the (chronologically sorted) month columns. Working
    # on the zero-filled matrix means a month without records carries the
    # previous total forward instead of showing 0% coverage.
    cumulative_vaccinated = heatmap_data.cumsum(axis=1)

    # Population per row, looked up explicitly so an unknown gender code
    # fails loudly rather than producing NaN.
    population = pd.Series(
        [population_estimates[gender] for gender in heatmap_data.index],
        index=heatmap_data.index,
    )
    cumulative_coverage_data = cumulative_vaccinated.div(population, axis=0) * 100

    # Cell text: count on the first line, cumulative coverage in parentheses.
    annotations = heatmap_data.astype(int).astype(str) + "\n(" + cumulative_coverage_data.round(2).astype(str) + "%)"
    return heatmap_data, annotations
# Count matrix and annotation text for each dose.
first_dose_heatmap_data, first_dose_annotations = prepare_data(first_dose_data, population_estimates)
second_dose_heatmap_data, second_dose_annotations = prepare_data(second_dose_data, population_estimates)

# Two stacked heatmaps: first dose on top, second dose below.
fig, axes = plt.subplots(2, 1, figsize=(12, 8))
dose_panels = (
    (
        axes[0],
        first_dose_heatmap_data,
        first_dose_annotations,
        "Monthly 1st Dose Vaccination Counts by Gender (with Cumulative Coverage in %)",
    ),
    (
        axes[1],
        second_dose_heatmap_data,
        second_dose_annotations,
        "Monthly 2nd Dose Vaccination Counts by Gender (with Cumulative Coverage in %)",
    ),
)
for panel_ax, panel_counts, panel_annotations, panel_title in dose_panels:
    # Colour encodes the monthly count; the cell text comes from the
    # prepared annotations (count plus cumulative coverage).
    sns.heatmap(
        panel_counts,
        cmap="viridis",
        annot=panel_annotations,
        fmt="",
        ax=panel_ax,
        cbar_kws={'label': 'Number of Vaccinated Persons'},
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Month")
    panel_ax.set_ylabel("Gender")
    panel_ax.set_xticklabels(panel_counts.columns.astype(str), rotation=45)
    # Rows are gender codes 0 and 1, labelled Male and Female.
    panel_ax.set_yticklabels(["Male", "Female"], rotation=0)

plt.tight_layout()
plt.show()
Monthly Vaccination Trends by Age Group and Gender
This analysis examines the distribution of 1st and 2nd dose vaccinations across age groups for males and females over time. It provides insights into how vaccination efforts were prioritized among different age demographics and highlights gender-based differences in vaccination uptake. By tracking cumulative vaccination coverage as a percentage of the population, this analysis also evaluates the progress made in reaching immunization targets for various age groups.
# @title Monthly Vaccination Counts by Age Group for 1st Dose (Male and Female, with Cumulative Coverage in %)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Population estimates aggregated to the age categories used in the vaccination data.
# Each age category lists its component counts as (male counts, female counts);
# the counts are summed to give one male and one female total per category.
population_estimates_age_gender = {
    age_band: {'male': sum(male_counts), 'female': sum(female_counts)}
    for age_band, (male_counts, female_counts) in {
        '0-19': ((48494, 55030, 51205, 47664), (45296, 51289, 48678, 45183)),
        '20-29': ((42739, 50490), (40144, 46690)),
        '30-39': ((68879, 69622), (64057, 65781)),
        '40-49': ((61158, 62894), (60805, 65253)),
        '50-59': ((61275, 59520), (67428, 69710)),
        '60-69': ((59254, 45217), (75596, 67036)),
        '70+': ((34981, 21970), (61032, 48034)),
    }.items()
}
# Parse the vaccination dates (unparseable values become NaT) and keep the monthly period
parsed_vaccination_dates = pd.to_datetime(vaccination['vaccination_date'], errors='coerce')
vaccination['month'] = parsed_vaccination_dates.dt.to_period('M')
# Keep only the rows recorded as the 1st dose ('1.pote')
is_first_dose = vaccination['vaccination_stage'] == '1.pote'
first_dose_data = vaccination[is_first_dose]
# Function to prepare data for plotting by age group and gender
def prepare_data_age_group(dose_data, gender_label, population_estimates=None):
    """Build the heatmap matrix and annotation labels for one gender.

    Args:
        dose_data: DataFrame with the columns 'vaccinated_person_gender',
            'month', 'age_category' and 'number_of_vaccinated_persons'.
        gender_label: 'male' selects gender code 0; any other value selects
            gender code 1 and is used as the key into the population table.
        population_estimates: Optional mapping
            ``{age_category: {gender_label: population}}``. Defaults to the
            module-level ``population_estimates_age_gender``.

    Returns:
        A ``(heatmap_data, annotations)`` pair of DataFrames indexed by age
        category with one column per month. ``heatmap_data`` holds the monthly
        counts (0 where nothing was recorded); ``annotations`` holds strings of
        the form ``"<count>\\n(<cumulative coverage>%)"``.

    Raises:
        KeyError: If an age category or gender label is missing from the
            population table.
    """
    if population_estimates is None:
        population_estimates = population_estimates_age_gender

    # Filter by gender
    gender_code = 0 if gender_label == 'male' else 1
    data = dose_data[dose_data['vaccinated_person_gender'] == gender_code]

    # Monthly totals per age category; the groupby sorts by month, so the
    # per-category running sum below accumulates in chronological order.
    summary = (
        data.groupby(['month', 'age_category'], observed=True)['number_of_vaccinated_persons']
        .sum()
        .reset_index()
    )
    summary['cumulative_vaccinated'] = (
        summary.groupby('age_category', observed=True)['number_of_vaccinated_persons'].cumsum()
    )

    # Look the population up once per row instead of a row-wise DataFrame.apply
    population = pd.Series(
        [population_estimates[age][gender_label] for age in summary['age_category']],
        index=summary.index,
        dtype=float,
    )
    summary['cumulative_coverage'] = (summary['cumulative_vaccinated'] / population) * 100

    # Pivot data for heatmap format and cumulative coverage for annotations
    heatmap_data = summary.pivot(
        index='age_category', columns='month', values='number_of_vaccinated_persons'
    ).fillna(0)
    # A month without vaccinations does not reset the running coverage: carry
    # the previous month's value forward and use 0 only before the first dose.
    cumulative_coverage_data = (
        summary.pivot(index='age_category', columns='month', values='cumulative_coverage')
        .ffill(axis=1)
        .fillna(0)
    )

    # Create annotations with cumulative coverage in parentheses
    annotations = (
        heatmap_data.astype(int).astype(str)
        + "\n("
        + cumulative_coverage_data.round(2).astype(str)
        + "%)"
    )
    return heatmap_data, annotations
# 1st-dose heatmap inputs, computed separately for males and females
male_first_dose_heatmap_data, male_first_dose_annotations = prepare_data_age_group(first_dose_data, 'male')
female_first_dose_heatmap_data, female_first_dose_annotations = prepare_data_age_group(first_dose_data, 'female')

# Males in the top row, females in the bottom row
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))
age_panels = [
    (
        "Monthly 1st Dose Vaccination Counts by Age Group (Males, with Cumulative Coverage in %)",
        male_first_dose_heatmap_data,
        male_first_dose_annotations,
    ),
    (
        "Monthly 1st Dose Vaccination Counts by Age Group (Females, with Cumulative Coverage in %)",
        female_first_dose_heatmap_data,
        female_first_dose_annotations,
    ),
]
for panel_ax, (panel_title, panel_counts, panel_notes) in zip(axes, age_panels):
    # Annotation strings are already formatted, hence fmt=""
    sns.heatmap(
        panel_counts,
        cmap="viridis",
        annot=panel_notes,
        fmt="",
        ax=panel_ax,
        cbar_kws={'label': 'Number of Vaccinated Persons'},
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Month")
    panel_ax.set_ylabel("Age Group")
    panel_ax.set_xticklabels(panel_counts.columns.astype(str), rotation=45)

plt.tight_layout()
plt.show()
Vaccine Dosage and Preparation Efficiency
Analyze the doses and types of vaccines administered over time. Investigate whether specific periods showed a preference for certain vaccine preparations or exhibited shifts in dosage amounts.
# @title Monthly Number of Vaccinated Persons by Preparation Type
import matplotlib.pyplot as plt
from matplotlib.cm import viridis
import pandas as pd
# Lookup from the preparation name found in the data to the commonly used vaccine name
preparation_mapping = {
    "BBIBP-CorV (Sinopharm)": "Sinopharm",
    "Comirnaty": "Pfizer-BioNTech",
    "CoronaVac(Sinovac)": "Sinovac",
    "Covshield(ChAdOx1_nCov-19)": "AstraZeneca",
    "Jcovden": "J&J Janssen",
    "Spikevax": "Moderna",
    "Vaxzevria": "AstraZeneca",
}

# Swap the technical names for the common ones
vaccination['preparation'] = vaccination['preparation'].replace(preparation_mapping)

# Monthly period of each vaccination (unparseable dates become NaT)
parsed_dates = pd.to_datetime(vaccination['vaccination_date'], errors='coerce')
vaccination['year_month'] = parsed_dates.dt.to_period('M')

# Rows: months, columns: preparations, values: vaccinated persons
monthly_vaccination_counts = (
    vaccination
    .groupby(['year_month', 'preparation'])['number_of_vaccinated_persons']
    .sum()
    .unstack(fill_value=0)
)

# One evenly spaced viridis colour per preparation
num_preparations = monthly_vaccination_counts.shape[1]
colors = [viridis(step / num_preparations) for step in range(num_preparations)]

# Stacked bars, one bar per month
fig, ax = plt.subplots(figsize=(12, 6))
monthly_vaccination_counts.plot(kind='bar', stacked=True, color=colors, width=0.8, ax=ax)

ax.set(
    title='Monthly Number of Vaccinated Persons by Brand Vaccine Names',
    xlabel='Year-Month',
    ylabel='Number of Vaccinated Persons',
)

# One tick per month, labelled with the period
ax.set_xticks(range(len(monthly_vaccination_counts)))
ax.set_xticklabels(list(map(str, monthly_vaccination_counts.index)), rotation=45)

# Legend placed outside the plotting area
ax.legend(
    title='Known Vaccine Names',
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
    fontsize='small',
    title_fontsize='medium',
)

# Only the bottom spine stays visible
for spine_name, is_visible in (('top', False), ('right', False), ('left', False), ('bottom', True)):
    ax.spines[spine_name].set_visible(is_visible)

plt.tight_layout()
plt.show()
Vaccine Distribution Across Age Groups
This analysis examines the variation in vaccine preferences among different age groups. By analyzing the proportional distribution of vaccine preparations, we aim to understand patterns in vaccine administration and potential prioritization strategies for different demographics. This provides insights into whether certain vaccine types were favored for specific age groups and helps evaluate the effectiveness and equity of vaccine allocation policies.
# @title Proportional Stacked Bar Plot of Vaccine Choice Within Each Age Group
import matplotlib.pyplot as plt
import pandas as pd
# Swap the technical preparation names for the common ones
vaccination['preparation'] = vaccination['preparation'].replace(preparation_mapping)

# Rows: age categories, columns: preparations, values: vaccinated persons
age_vaccine_counts = (
    vaccination
    .groupby(['age_category', 'preparation'], observed=False)['number_of_vaccinated_persons']
    .sum()
    .unstack(fill_value=0)
)

# Divide every row by its own total so each age category sums to 1
age_group_totals = age_vaccine_counts.sum(axis=1)
age_vaccine_proportions = age_vaccine_counts.div(age_group_totals, axis=0)

# One evenly spaced viridis colour per preparation
num_preparations = age_vaccine_proportions.shape[1]
colors = [viridis(step / num_preparations) for step in range(num_preparations)]

# Stacked bars, one bar per age category
fig, ax = plt.subplots(figsize=(12, 8))
age_vaccine_proportions.plot(kind='bar', stacked=True, color=colors, ax=ax)

ax.set(
    title='Proportion of Vaccine Preparations by Age Category',
    xlabel='Age Category',
    ylabel='Proportion of Vaccinated Persons',
)

# Show the y-axis as percentages, with an explicit tick at 100%
ax.yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])

# Legend placed outside the plotting area
ax.legend(
    title='Vaccine Type',
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
    fontsize='small',
    title_fontsize='medium',
)

# Only the bottom spine stays visible
for spine_name, is_visible in (('top', False), ('right', False), ('left', False), ('bottom', True)):
    ax.spines[spine_name].set_visible(is_visible)

plt.tight_layout()
plt.show()
# @title Monthly Vaccination Breakdown by Preparation Type for Each Age Category
import matplotlib.pyplot as plt
import pandas as pd
# Apply the mapping to replace technical names with commonly known names
vaccination['preparation'] = vaccination['preparation'].replace(preparation_mapping)

# Ensure 'vaccination_date' is in datetime format and extract the monthly period
vaccination['vaccination_date'] = pd.to_datetime(vaccination['vaccination_date'])
vaccination['month'] = vaccination['vaccination_date'].dt.to_period('M')

# Rows: (month, age_category), columns: preparations, values: vaccinated persons.
# observed=False is passed explicitly to suppress the pandas warning.
monthly_age_vaccine_counts = (
    vaccination
    .groupby(['month', 'age_category', 'preparation'], observed=False)['number_of_vaccinated_persons']
    .sum()
    .unstack(fill_value=0)
)

# One subplot per age category
age_categories = monthly_age_vaccine_counts.index.get_level_values('age_category').unique()
num_age_categories = len(age_categories)
# squeeze=False always returns a 2-D grid; without it a single age category
# yields a bare Axes object and the `axes[i]` indexing below fails.
fig, axes = plt.subplots(
    num_age_categories, 1,
    figsize=(14, 4 * num_age_categories),
    sharex=True,
    squeeze=False,
)
axes = axes[:, 0]  # single column -> 1-D array of Axes

# One evenly spaced viridis colour per preparation
num_preparations = len(monthly_age_vaccine_counts.columns)
colors = [viridis(i / num_preparations) for i in range(num_preparations)]

# Plotting for each age category
for i, age_category in enumerate(age_categories):
    ax = axes[i]
    data = monthly_age_vaccine_counts.xs(age_category, level='age_category')
    data.plot(kind='bar', stacked=True, color=colors, ax=ax, width=0.8)

    # Formatting each subplot
    ax.set_title(f'Age Category: {age_category}')
    ax.set_ylabel('Number of Vaccinated Persons')
    ax.legend(title='Vaccine Type', fontsize='small', title_fontsize='medium', loc='upper left')

    # Plain scalar tick labels on the y-axis
    ax.get_yaxis().set_major_formatter(plt.matplotlib.ticker.ScalarFormatter())

    # Customize the chart by hiding all spines except the bottom
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(True)

# The x-axis is shared, so only the last subplot gets the label
axes[-1].set_xlabel('Month')
plt.tight_layout()
plt.show()
Hypothetical Herd Immunity Thresholds
Model the potential achievement of herd immunity by estimating the combined immunity from vaccination coverage and recovery from confirmed cases. Analyze whether the level of immunity in the population approached or surpassed theoretical thresholds at any point, and evaluate the implications for controlling the spread of COVID-19.
# @title COVID-19 Vaccination Coverage for First Dose Visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
# Expects the 'vaccination' DataFrame with a 'month' column to be available
# Step 1: keep 1st-dose rows ('1.pote') that have an age category
vaccination_first_dose = (
    vaccination.loc[vaccination['vaccination_stage'] == '1.pote']
    .copy()
    .dropna(subset=['age_category'])
)
vaccination_first_dose['age_category'] = vaccination_first_dose['age_category'].astype(str)
# Drop anything that turned into the literal string 'nan' during the cast
vaccination_first_dose = vaccination_first_dose.loc[vaccination_first_dose['age_category'].ne('nan')]

# Vaccinated persons per (month, age category)
vaccination_agg = (
    vaccination_first_dose
    .groupby(['month', 'age_category'])
    .agg(num_vaccinated=('number_of_vaccinated_persons', 'sum'))
    .reset_index()
)

# Population estimates per age category: component counts as (male, female), summed per gender
population_estimates_age_gender = {
    age_band: {'male': sum(male_counts), 'female': sum(female_counts)}
    for age_band, (male_counts, female_counts) in {
        '0-19': ((48494, 55030, 51205, 47664), (45296, 51289, 48678, 45183)),
        '20-29': ((42739, 50490), (40144, 46690)),
        '30-39': ((68879, 69622), (64057, 65781)),
        '40-49': ((61158, 62894), (60805, 65253)),
        '50-59': ((61275, 59520), (67428, 69710)),
        '60-69': ((59254, 45217), (75596, 67036)),
        '70+': ((34981, 21970), (61032, 48034)),
    }.items()
}
# Both genders combined, per age category
population_totals = {
    age: counts['male'] + counts['female']
    for age, counts in population_estimates_age_gender.items()
}
vaccination_agg['estimated_population'] = vaccination_agg['age_category'].map(population_totals)

# Month as a timestamp, then the running total and coverage within each age category
vaccination_agg['month'] = pd.to_datetime(vaccination_agg['month'].astype(str))
vaccination_agg = vaccination_agg.sort_values(by=['age_category', 'month'])
vaccination_agg['cumulative_vaccinated'] = vaccination_agg.groupby('age_category')['num_vaccinated'].cumsum()
vaccination_agg['coverage_per_cent'] = (
    vaccination_agg['cumulative_vaccinated'].div(vaccination_agg['estimated_population']).mul(100)
)

# Table that feeds the plot
result_table = vaccination_agg[['month', 'age_category', 'coverage_per_cent']]

# Step 4: one viridis colour per age category
sns.set_palette(sns.color_palette("viridis", n_colors=result_table['age_category'].nunique()))

fig, ax1 = plt.subplots(figsize=(12, 6))

# One coverage line per age category
sns.lineplot(data=result_table, x='month', y='coverage_per_cent', hue='age_category', marker='o', ax=ax1)
ax1.set_ylim(0, 100)  # coverage axis runs from 0 to 100

# Dashed reference lines at the herd immunity thresholds, labelled just right of the first month
herd_immunity_thresholds = [60, 70, 85]
threshold_label_x = result_table['month'].min() + pd.Timedelta(days=10)
for threshold in herd_immunity_thresholds:
    ax1.axhline(y=threshold, color='gray', linestyle='--', linewidth=1)
    ax1.text(threshold_label_x, threshold + 1, f'{threshold}%', color='gray')

ax1.set(
    title='COVID-19 Vaccination Coverage for First Dose by Age Category',
    xlabel='Month',
    ylabel='Vaccination Coverage (%)',
)
ax1.legend(title='Age Category', loc='upper left', bbox_to_anchor=(1, 1))  # legend outside the plot area
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# No gridlines
ax1.grid(False)

# Only the bottom spine stays visible
for spine in ('top', 'right', 'left'):
    ax1.spines[spine].set_visible(False)

plt.tight_layout()
plt.show()
# @title COVID-19 Vaccination Coverage for Second Dose Visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
# Expects the 'vaccination' DataFrame with a 'month' column to be available
# Step 1: keep 2nd-dose rows ('2.pote') that have an age category
vaccination_second_dose = (
    vaccination.loc[vaccination['vaccination_stage'] == '2.pote']
    .copy()
    .dropna(subset=['age_category'])
)
vaccination_second_dose['age_category'] = vaccination_second_dose['age_category'].astype(str)
# Drop anything that turned into the literal string 'nan' during the cast
vaccination_second_dose = vaccination_second_dose.loc[vaccination_second_dose['age_category'].ne('nan')]

# Vaccinated persons per (month, age category)
vaccination_agg = (
    vaccination_second_dose
    .groupby(['month', 'age_category'])
    .agg(num_vaccinated=('number_of_vaccinated_persons', 'sum'))
    .reset_index()
)

# Population estimates per age category: component counts as (male, female), summed per gender
population_estimates_age_gender = {
    age_band: {'male': sum(male_counts), 'female': sum(female_counts)}
    for age_band, (male_counts, female_counts) in {
        '0-19': ((48494, 55030, 51205, 47664), (45296, 51289, 48678, 45183)),
        '20-29': ((42739, 50490), (40144, 46690)),
        '30-39': ((68879, 69622), (64057, 65781)),
        '40-49': ((61158, 62894), (60805, 65253)),
        '50-59': ((61275, 59520), (67428, 69710)),
        '60-69': ((59254, 45217), (75596, 67036)),
        '70+': ((34981, 21970), (61032, 48034)),
    }.items()
}
# Both genders combined, per age category
population_totals = {
    age: counts['male'] + counts['female']
    for age, counts in population_estimates_age_gender.items()
}
vaccination_agg['estimated_population'] = vaccination_agg['age_category'].map(population_totals)

# Month as a timestamp, then the running total and coverage within each age category
vaccination_agg['month'] = pd.to_datetime(vaccination_agg['month'].astype(str))
vaccination_agg = vaccination_agg.sort_values(by=['age_category', 'month'])
vaccination_agg['cumulative_vaccinated'] = vaccination_agg.groupby('age_category')['num_vaccinated'].cumsum()
vaccination_agg['coverage_per_cent'] = (
    vaccination_agg['cumulative_vaccinated'].div(vaccination_agg['estimated_population']).mul(100)
)

# Table that feeds the plot
result_table = vaccination_agg[['month', 'age_category', 'coverage_per_cent']]

# Step 4: one viridis colour per age category
sns.set_palette(sns.color_palette("viridis", n_colors=result_table['age_category'].nunique()))

fig, ax1 = plt.subplots(figsize=(12, 6))

# One coverage line per age category
sns.lineplot(data=result_table, x='month', y='coverage_per_cent', hue='age_category', marker='o', ax=ax1)
ax1.set_ylim(0, 100)  # coverage axis runs from 0 to 100

# Dashed reference lines at the herd immunity thresholds, labelled just right of the first month
herd_immunity_thresholds = [60, 70, 85]
threshold_label_x = result_table['month'].min() + pd.Timedelta(days=10)
for threshold in herd_immunity_thresholds:
    ax1.axhline(y=threshold, color='gray', linestyle='--', linewidth=1)
    ax1.text(threshold_label_x, threshold + 1, f'{threshold}%', color='gray')

ax1.set(
    title='COVID-19 Vaccination Coverage for Second Dose',
    xlabel='Month',
    ylabel='Vaccination Coverage (%)',
)
ax1.legend(title='Age Category', loc='upper left', bbox_to_anchor=(1, 1))  # legend outside the plot area
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# No gridlines
ax1.grid(False)

# Only the bottom spine stays visible
for spine in ('top', 'right', 'left'):
    ax1.spines[spine].set_visible(False)

plt.tight_layout()
plt.show()
# @title COVID-19 Vaccination Coverage for Second Dose with Recovery Index
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
# Step 1: Prepare output_table for recovery index
# Expects the 'covid' DataFrame to be available
# Ensure 'date' is in datetime format (unparseable values become NaT)
covid['date'] = pd.to_datetime(covid['date'], errors='coerce')

# Extract year-month for grouping
covid['year_month'] = covid['date'].dt.to_period('M')

# Per month: the last reported cumulative recovered count and the sum of
# confirmed cases. 'last' depends on row order, so the rows are sorted by date
# first (stable sort, so same-day rows keep their original order).
monthly_summary = (
    covid.sort_values('date', kind='stable')
    .groupby('year_month')
    .agg(
        cumulative_recovered_cases=('cumulative_recovered_cases', 'last'),
        total_cases=('confirmed_covid19_cases', 'sum'),
    )
    .reset_index()
)

# 'year_month' is always a monthly Period here (built with to_period above), so
# the timestamp column that the filter below relies on is created unconditionally.
monthly_summary['month'] = monthly_summary['year_month'].dt.to_timestamp()

# Calculate cumulative sum for total cases
monthly_summary['cumulative_total_cases'] = monthly_summary['total_cases'].cumsum()

# Recovery index: cumulative recovered as a percentage of cumulative confirmed cases
monthly_summary['recovery_index'] = (
    monthly_summary['cumulative_recovered_cases'] / monthly_summary['cumulative_total_cases']
) * 100

# Define the vaccination period
vaccination_period_start = pd.to_datetime("2020-12-01")
vaccination_period_end = pd.to_datetime("2021-09-30")

# Keep only the months inside the vaccination period (both ends inclusive)
output_table = monthly_summary[
    (monthly_summary['month'] >= vaccination_period_start)
    & (monthly_summary['month'] <= vaccination_period_end)
].reset_index(drop=True)
# Step 2: Prepare vaccination coverage data for the same period
# Expects 'vaccination' to contain 'vaccination_stage', 'month', 'age_category'
# and 'number_of_vaccinated_persons'

# Bring 'month' to timestamps when it is stored as objects or monthly periods
month_dtype = vaccination['month'].dtype
if month_dtype == 'object':
    vaccination['month'] = pd.to_datetime(vaccination['month'], errors='coerce')
elif month_dtype == 'period[M]':
    vaccination['month'] = vaccination['month'].dt.to_timestamp()

# 2nd-dose rows ('2.pote') whose month falls inside the vaccination period (inclusive)
is_second_dose = vaccination['vaccination_stage'] == '2.pote'
is_in_period = vaccination['month'].between(vaccination_period_start, vaccination_period_end)
vaccination_second_dose = vaccination[is_second_dose & is_in_period].copy()

# Rows without an age category are dropped; the remaining categories become strings
vaccination_second_dose = vaccination_second_dose.dropna(subset=['age_category'])
vaccination_second_dose['age_category'] = vaccination_second_dose['age_category'].astype(str)

# Vaccinated persons per (month, age category)
vaccination_agg = (
    vaccination_second_dose
    .groupby(['month', 'age_category'])
    .agg(num_vaccinated=('number_of_vaccinated_persons', 'sum'))
    .reset_index()
)

# Population estimates per age category: component counts as (male, female), summed per gender
population_estimates_age_gender = {
    age_band: {'male': sum(male_counts), 'female': sum(female_counts)}
    for age_band, (male_counts, female_counts) in {
        '0-19': ((48494, 55030, 51205, 47664), (45296, 51289, 48678, 45183)),
        '20-29': ((42739, 50490), (40144, 46690)),
        '30-39': ((68879, 69622), (64057, 65781)),
        '40-49': ((61158, 62894), (60805, 65253)),
        '50-59': ((61275, 59520), (67428, 69710)),
        '60-69': ((59254, 45217), (75596, 67036)),
        '70+': ((34981, 21970), (61032, 48034)),
    }.items()
}
# Both genders combined, per age category
population_totals = {
    age: counts['male'] + counts['female']
    for age, counts in population_estimates_age_gender.items()
}
vaccination_agg['estimated_population'] = vaccination_agg['age_category'].map(population_totals)

# Running total and coverage within each age category, in month order
vaccination_agg = vaccination_agg.sort_values(by=['age_category', 'month'])
vaccination_agg['cumulative_vaccinated'] = vaccination_agg.groupby('age_category')['num_vaccinated'].cumsum()
vaccination_agg['coverage_per_cent'] = (
    vaccination_agg['cumulative_vaccinated'].div(vaccination_agg['estimated_population']).mul(100)
)

# Table that feeds the plot
result_table = vaccination_agg[['month', 'age_category', 'coverage_per_cent']]
# Step 3: Plot vaccination coverage and recovery index
fig, ax1 = plt.subplots(figsize=(12, 6))

# Primary y-axis: one coverage line per age category, coloured with viridis
sns.lineplot(
    data=result_table,
    x='month',
    y='coverage_per_cent',
    hue='age_category',
    marker='o',
    palette='viridis',
    ax=ax1,
)
ax1.set_ylim(0, 100)  # coverage axis runs from 0 to 100

# Dashed reference lines at the herd immunity thresholds, labelled just right of the first month
herd_immunity_thresholds = [60, 70, 85]
threshold_label_x = result_table['month'].min() + pd.Timedelta(days=10)
for threshold in herd_immunity_thresholds:
    ax1.axhline(y=threshold, color='gray', linestyle='--', linewidth=1)
    ax1.text(threshold_label_x, threshold + 1, f'{threshold}%', color='gray')

ax1.set(
    title='COVID-19 Vaccination Coverage for Second Dose with Recovery Index',
    xlabel='Month',
    ylabel='Vaccination Coverage (%)',
)
# Legend pushed further right so it does not overlap the secondary y-axis ticks
ax1.legend(title='Age Category', loc='upper left', bbox_to_anchor=(1.05, 1))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Secondary y-axis: recovery index as a black dashed line
ax2 = ax1.twinx()
sns.lineplot(
    data=output_table,
    x='month',
    y='recovery_index',
    color='black',
    linestyle='--',
    marker='x',
    ax=ax2,
    label='Recovery Index (%)',
)
ax2.set_ylabel('Recovery Index (%)')
ax2.tick_params(axis='y')  # default tick styling on the secondary axis

# Hide the top, right and left spines on both axes
for spine in ('top', 'right', 'left'):
    for twin_axis in (ax1, ax2):
        twin_axis.spines[spine].set_visible(False)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Vaccination Facility Activity
Examine how vaccinations were distributed across the busiest vaccination facilities. The heatmaps below break down the number of vaccinated persons at the top 25 facilities by age category and by month, showing which age groups each facility served and when its activity peaked.
# @title Vaccinated Persons by Facility and Age Category
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Expects the vaccination data in a DataFrame named 'vaccination'
# Step 1: Select the top 25 facilities and clean facility names.
# Facilities are ranked by the number of persons they vaccinated; counting
# rows (value_counts) would rank them by how many records they have instead.
facility_totals = vaccination.groupby('vaccination_facility_name')['number_of_vaccinated_persons'].sum()
top_25_facilities = facility_totals.nlargest(25).index

# Keep the part of the name before the first comma for top-25 facilities;
# every other row gets a missing value and is dropped below.
facility_names = vaccination['vaccination_facility_name']
vaccination['facility_group'] = (
    facility_names.str.split(',').str[0].where(facility_names.isin(top_25_facilities))
)

# Filter out rows without a 'facility_group' (i.e., "Other" facilities)
filtered_vaccination = vaccination.dropna(subset=['facility_group'])

# Step 2: Sum vaccinated persons per cleaned facility name and age category
facility_age_data = (
    filtered_vaccination
    .groupby(['facility_group', 'age_category'], observed=True)['number_of_vaccinated_persons']
    .sum()
    .unstack(fill_value=0)
)

# Step 3: Order facilities by total vaccinated persons, largest first.
# The row totals are kept in a separate Series so that no temporary column has
# to be inserted into (and removed from) the age-category columns.
facility_order = facility_age_data.sum(axis=1).sort_values(ascending=False).index
facility_age_data = facility_age_data.loc[facility_order]

# Step 4: Scale each row by its own maximum so colour intensity is row-relative
facility_age_data_normalized = facility_age_data.div(facility_age_data.max(axis=1), axis=0)

# Step 5: Colour shows the row-normalized value; annotations show the raw counts
plt.figure(figsize=(12, 10))
sns.heatmap(facility_age_data_normalized, cmap="viridis", annot=facility_age_data, fmt="d",
            cbar_kws={'label': 'Normalized Vaccinated Persons'}, linewidths=.5)
plt.title("Number of Vaccinated Persons by Facility and Age Category")
plt.xlabel("Age Category")
plt.ylabel("Vaccination Facility (Top 25, Ordered)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# @title Vaccinated Persons by Facility and Month
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Expects the vaccination data in a DataFrame named 'vaccination'
# Format 'month' as a 'YYYY-MM' string. Monthly Period values are converted to
# timestamps explicitly first, so the cell also works when 'month' has not
# already been turned into timestamps.
month_values = vaccination['month']
if isinstance(month_values.dtype, pd.PeriodDtype):
    month_values = month_values.dt.to_timestamp()
vaccination['month'] = pd.to_datetime(month_values).dt.strftime('%Y-%m')

# Step 1: Select the top 25 facilities and clean facility names.
# Facilities are ranked by the number of persons they vaccinated; counting
# rows (value_counts) would rank them by how many records they have instead.
facility_totals = vaccination.groupby('vaccination_facility_name')['number_of_vaccinated_persons'].sum()
top_25_facilities = facility_totals.nlargest(25).index

# Keep the part of the name before the first comma for top-25 facilities;
# every other row gets a missing value and is dropped below.
facility_names = vaccination['vaccination_facility_name']
vaccination['facility_group'] = (
    facility_names.str.split(',').str[0].where(facility_names.isin(top_25_facilities))
)

# Filter out rows without a 'facility_group' (i.e., "Other" facilities)
filtered_vaccination = vaccination.dropna(subset=['facility_group'])

# Step 2: Sum vaccinated persons per cleaned facility name and formatted month
facility_month_data = (
    filtered_vaccination
    .groupby(['facility_group', 'month'], observed=True)['number_of_vaccinated_persons']
    .sum()
    .unstack(fill_value=0)
)

# Step 3: Order facilities by total vaccinated persons, largest first
# (row totals kept in a separate Series instead of a temporary column)
facility_order = facility_month_data.sum(axis=1).sort_values(ascending=False).index
facility_month_data = facility_month_data.loc[facility_order]

# Step 4: Scale each row by its own maximum so colour intensity is row-relative
facility_month_data_normalized = facility_month_data.div(facility_month_data.max(axis=1), axis=0)

# Step 5: Colour shows the row-normalized value; annotations show the raw counts
plt.figure(figsize=(12, 10))
sns.heatmap(facility_month_data_normalized, cmap="viridis", annot=facility_month_data, fmt="d",
            cbar_kws={'label': 'Normalized Vaccinated Persons'}, linewidths=.5)
plt.title("Number of Vaccinated Persons by Facility and Month")
plt.xlabel("Month")
plt.ylabel("Vaccination Facility (Top 25, Ordered)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Evaluating Deep Learning Models for COVID-19 Incidence Forecasting
This analysis evaluates the performance of four deep learning models—RNN, LSTM, BiLSTM, and GRU—for forecasting daily COVID-19 cases. Each model's predictions were assessed against actual case data using metrics such as RMSE, MAE, MAPE, RMSLE, and explained variance. The LSTM and GRU models consistently outperformed RNN and BiLSTM in terms of lower errors and higher explained variance, suggesting their superior ability to capture temporal patterns in the data.
While the LSTM model achieved the best overall balance of accuracy across multiple metrics, all models exhibited some level of deviation, particularly during periods of rapid case fluctuations. The higher RMSLE values suggest that the models struggle to accurately predict logarithmic-scale variations in the data, which might stem from data limitations, such as insufficient granularity or missing contextual features like mobility data, lockdown measures, or vaccination rates.
These results indicate that while the models provide a reasonable approximation of case trends, achieving higher accuracy may require more granular data, such as demographic-specific case counts, region-wise infection data, or behavioral indicators. Incorporating such detailed features could potentially improve the models' predictive performance and their ability to generalize across different phases of the pandemic.
# @title Evaluating Deep Learning Models for COVID-19 Daily Incidence Data Forecasting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Dense, Input, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, explained_variance_score
import tensorflow as tf
import logging
import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

# Suppress TensorFlow-specific logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)
tf.get_logger().setLevel('ERROR')

# Configure model parameters
params = {
    "learning_rate": 0.0005,
    "epochs": 100,  # Fixed 100 epochs for training
    "hidden_units": 16,
    "sequence_length": 5,
}

# Load and preprocess data
covid['date'] = pd.to_datetime(covid['date'])
# Fill gaps forward, then backward for any leading gaps.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
covid.ffill(inplace=True)
covid.bfill(inplace=True)

# Model inputs: confirmed cases, recovered cases and number of tests
data = covid[['confirmed_covid19_cases', 'recovered_cases', 'number_of_tests']]

# Scale every feature to [0, 1].
# NOTE(review): the scaler is fitted on the full series, i.e. including the
# rows that later form the test split — confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
data_scaled = scaler.fit_transform(data)
data_scaled = pd.DataFrame(data_scaled, columns=['confirmed', 'recovered', 'tests'])
# Helper function to create sequences
def create_sequences(data, sequence_length=5):
    """Cut a feature table into sliding windows with next-step targets.

    Args:
        data: DataFrame of features that includes a 'confirmed' column.
        sequence_length: Number of consecutive rows in each window.

    Returns:
        A ``(sequences, labels)`` pair of NumPy arrays. ``sequences`` has shape
        ``(n_windows, sequence_length, n_features)`` and ``labels[i]`` is the
        'confirmed' value of the row that follows window ``i``. When ``data``
        has no more than ``sequence_length`` rows, both arrays are empty but
        keep these dimensions.
    """
    # Convert once; slicing the DataFrame with .iloc on every iteration is
    # far slower than slicing a NumPy array.
    values = data.to_numpy()
    targets = data['confirmed'].to_numpy()

    n_windows = len(values) - sequence_length
    if n_windows <= 0:
        # Too few rows for a single window: return well-shaped empty arrays
        # rather than a 1-D array that breaks shape-based code downstream.
        return (
            np.empty((0, sequence_length, values.shape[1]), dtype=values.dtype),
            np.empty((0,), dtype=targets.dtype),
        )

    sequences = np.stack([values[start:start + sequence_length] for start in range(n_windows)])
    # Copy so the labels do not alias the DataFrame's underlying buffer
    labels = targets[sequence_length:].copy()
    return sequences, labels
# Windowed inputs and next-step targets
X, y = create_sequences(data_scaled, params["sequence_length"])

# Chronological split: the first 80% of the windows train, the rest test
train_size = int(0.8 * len(X))
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

# Per-model containers: trained models, training loss, metrics and predictions
models, loss_history, metrics, predictions = {}, {}, {}, {}
# Adjusted evaluation function with RMSLE guard
def evaluate_model(model, X_test, y_test, model_name):
    """Score a trained model on the test windows and record the results.

    Predictions and targets are mapped back to the original scale with the
    module-level ``scaler``. The error metrics are written to
    ``metrics[model_name]`` and the predictions (after the 1e-9 floor applied
    for RMSLE) to ``predictions[model_name]``. Nothing is returned.
    """
    def _unscale_first_column(column, n_rows):
        # Pad with two zero columns so the array matches the three features the
        # scaler was fitted on, then keep only the first (confirmed) column.
        padded = np.concatenate([column, np.zeros((n_rows, 2))], axis=1)
        return scaler.inverse_transform(padded)[:, 0]

    output = model.predict(X_test)
    if output.ndim == 3:
        # Keep only the last step of a 3-D output
        output = output[:, -1, :]
    if output.shape[1] != 1:
        output = output.reshape(-1, 1)

    predicted_cases = _unscale_first_column(output, output.shape[0])
    actual_cases = _unscale_first_column(y_test.reshape(-1, 1), y_test.shape[0])

    # Standard metrics on the unscaled values
    scores = {
        'RMSE': np.sqrt(mean_squared_error(actual_cases, predicted_cases)),
        'MAE': mean_absolute_error(actual_cases, predicted_cases),
        'MAPE': mean_absolute_percentage_error(actual_cases, predicted_cases),
        'EV': explained_variance_score(actual_cases, predicted_cases),
    }

    # Floor both series at a small positive value before taking logarithms
    floor = 1e-9
    actual_cases = np.maximum(actual_cases, floor)
    predicted_cases = np.maximum(predicted_cases, floor)
    scores['RMSLE'] = np.sqrt(mean_squared_error(np.log1p(actual_cases), np.log1p(predicted_cases)))

    metrics[model_name] = scores
    predictions[model_name] = predicted_cases  # kept for later plotting
# Function to build, train, and evaluate model
def build_train_evaluate(model_name, layer):
    """Wrap ``layer`` in a small network, train it and evaluate it.

    The network is an input matching the training windows, the given recurrent
    ``layer`` and a single-unit dense output, trained with Adam on MSE. The
    trained model goes to ``models[model_name]``, its per-epoch training loss
    to ``loss_history[model_name]``, and ``evaluate_model`` records the test
    metrics and predictions.
    """
    print(f"\nTraining {model_name} Model...")

    window_shape = (X_train.shape[1], X_train.shape[2])
    network_layers = [Input(shape=window_shape), layer, Dense(1)]
    model = Sequential(network_layers)

    optimizer = Adam(learning_rate=params["learning_rate"])
    model.compile(optimizer=optimizer, loss='mse')

    # Fixed number of epochs; the test split is passed as validation data
    history = model.fit(
        X_train,
        y_train,
        epochs=params["epochs"],
        batch_size=32,
        validation_data=(X_test, y_test),
        verbose=0,
    )

    models[model_name] = model
    loss_history[model_name] = history.history['loss']
    evaluate_model(model, X_test, y_test, model_name)
# Fit and score one model per recurrent architecture. All four use the same
# number of hidden units and ReLU activation; every layer is instantiated
# before the first model starts training.
for model_name, layer in {
    'RNN': SimpleRNN(params["hidden_units"], activation='relu'),
    'LSTM': LSTM(params["hidden_units"], activation='relu'),
    'BiLSTM': Bidirectional(LSTM(params["hidden_units"], activation='relu')),
    'GRU': GRU(params["hidden_units"], activation='relu'),
}.items():
    build_train_evaluate(model_name, layer)
### Training loss per epoch, all models on a single chart ###
plt.figure(figsize=(12, 6))
loss_axes = plt.gca()

# One curve per model, in the order the models were trained
for model_name, train_loss in loss_history.items():
    loss_axes.plot(train_loss, label=f'{model_name} Training Loss')

loss_axes.set_title("Evaluating Deep Learning Models for COVID-19 Daily Incidence Data Forecasting\nTraining Loss Over Epochs")
loss_axes.set_xlabel("Epochs")
loss_axes.set_ylabel("Training Loss")
loss_axes.legend()

# Minimal frame: only the bottom spine stays, and y-axis tick marks are removed
for hidden_side in ('top', 'right', 'left'):
    loss_axes.spines[hidden_side].set_visible(False)
loss_axes.tick_params(axis='x', which='both', bottom=True, top=False)
loss_axes.tick_params(axis='y', which='both', left=False, right=False)

plt.show()
### One bar chart per metric, value labels placed just inside each bar's end ###
metrics_df = pd.DataFrame(metrics).T  # rows = models, columns = metrics
metric_names = ['RMSE', 'MAE', 'MAPE', 'EV', 'RMSLE']
metric_titles = ['RMSE', 'MAE', 'MAPE', 'Explained Variance', 'RMSLE']

for metric, metric_title in zip(metric_names, metric_titles):
    plt.figure(figsize=(8, 5))
    bars = plt.bar(metrics_df.index, metrics_df[metric], color='skyblue')

    # Label each bar with its rounded value, 5% of the bar height short of its end
    for bar in bars:
        yval = bar.get_height()
        label_x = bar.get_x() + bar.get_width()/2
        label_y = yval - yval*0.05
        plt.text(label_x, label_y, round(yval, 2), ha='center', va='top', color="black")

    plt.title(f"Model Evaluation - {metric_title}")
    plt.xlabel('Models')
    plt.ylabel(metric)

    # Minimal frame: only the bottom spine stays, and y-axis tick marks are removed
    metric_axes = plt.gca()
    for hidden_side in ('top', 'right', 'left'):
        metric_axes.spines[hidden_side].set_visible(False)
    metric_axes.tick_params(axis='x', which='both', bottom=True, top=False)
    metric_axes.tick_params(axis='y', which='both', left=False, right=False)

    plt.show()
### Actual test-period values against every model's predictions ###
plt.figure(figsize=(12, 6))

# Undo the scaling of the targets: pad with two zero columns for the scaler
# and keep only the first column of the result
scaled_targets = y_test.reshape(-1, 1)
zero_padding = np.zeros((y_test.shape[0], 2))
actual_cases = scaler.inverse_transform(np.hstack([scaled_targets, zero_padding]))[:, 0]
plt.plot(actual_cases, label="Actual COVID-19 Cases", color="black", linewidth=2)

# Overlay one line per model
for model_name, predicted_cases in predictions.items():
    plt.plot(predicted_cases, label=f"{model_name} Prediction")

plt.title("COVID-19 Cases: Actual vs. Model Predictions")
plt.xlabel("Time (days)")
plt.ylabel("COVID-19 Cases")
plt.legend()

# Keep the left and bottom spines (with their ticks); drop the top and right ones
comparison_axes = plt.gca()
for hidden_side in ('top', 'right'):
    comparison_axes.spines[hidden_side].set_visible(False)
comparison_axes.tick_params(axis='x', which='both', bottom=True, top=False)
comparison_axes.tick_params(axis='y', which='both', left=True, right=False)

plt.show()
Training RNN Model... 7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 22ms/step Training LSTM Model... 7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 26ms/step Training BiLSTM Model... 7/7 ━━━━━━━━━━━━━━━━━━━━ 1s 48ms/step Training GRU Model... 7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step
import matplotlib.dates as mdates
# @title Active Cases
fig, ax = plt.subplots(figsize=(12, 6))

# Single line of active cases over time, coloured from the Viridis palette
ax.plot(covid['date'], covid['active_cases'], label='Active Cases', color=viridis_colors[6])

# Date axis: one major tick every three months, rendered as year-month
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# Slanted tick labels so neighbouring dates do not collide
plt.xticks(rotation=45, ha='right')

ax.set(xlabel='Month', ylabel='Number of Cases', title='Active Cases')

# Only the bottom spine remains visible
for spine_side, is_shown in (('top', False), ('right', False), ('left', False), ('bottom', True)):
    ax.spines[spine_side].set_visible(is_shown)

plt.show()
# @title Morbidity and Mortality Rates (%) Over Time
import matplotlib.dates as mdates
# One figure holding both rate curves
fig, ax = plt.subplots(figsize=(12, 6))

# Legend text, source column and Viridis palette index for each curve
labels = ['Morbidity rate, %', 'Mortality rate, %']
cols = ['morbidity_rate', 'mortality_rate']
color_indices = [0, 5]  # change these to pick other Viridis shades

for label, col, color_index in zip(labels, cols, color_indices):
    ax.plot(covid['date'], covid[col], label=label, color=viridis_colors[color_index])

# Date axis: one major tick every three months, rendered as year-month
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# Slanted tick labels so neighbouring dates do not collide
plt.xticks(rotation=45, ha='right')

ax.set(xlabel='Month', ylabel='Rate, %', title='Morbidity and Mortality Rates')

# Only the bottom spine remains visible
for spine_side, is_shown in (('top', False), ('right', False), ('left', False), ('bottom', True)):
    ax.spines[spine_side].set_visible(is_shown)

plt.show()

# Write the chart to disk through the figure handle
fig.savefig('morbidity_mortality_rates.png', dpi=300, bbox_inches='tight')
# @title Share of Positive Daily Test Results
import matplotlib.dates as mdates
# Daily share of positive test results as a single line
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(covid['date'], covid['proportion'], color=viridis_colors[2])

ax.set(
    xlabel='Date',
    ylabel='Share of positive test results (%)',
    title='Share of Positive Daily Test Results',
)

# Date axis: one major tick every three months, rendered as year-month
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45, ha='right')

# Only the bottom spine remains visible
for spine_side, is_shown in (('top', False), ('right', False), ('left', False), ('bottom', True)):
    ax.spines[spine_side].set_visible(is_shown)

plt.show()
# @title Breakthrough Cases (Vaccinated) Over Time
import pandas as pd
import matplotlib.pyplot as plt
# Make sure 'date' is a datetime column before filtering and plotting on a date axis
covid['date'] = pd.to_datetime(covid['date'])
# Keep records from 2021-09-01 onwards
covid_filtered = covid[covid['date'] >= '2021-09-01']
# Total confirmed cases among vaccinated people for each date
breakthrough_cases = covid_filtered.groupby('date')['confirmed_covid19_cases_vaccinated'].sum()

# Plot the data
plt.figure(figsize=(12, 6))
plt.plot(breakthrough_cases.index, breakthrough_cases, label='Breakthrough Cases (Vaccinated)', color=colors[1])

# Add labels and title
plt.xlabel('Date')
plt.ylabel('Number of Cases')
plt.title('Breakthrough Cases (Vaccinated) Over Time')
plt.legend()

# Hide every spine except the bottom one. The axes are looked up with
# plt.gca() so the styling lands on this figure rather than on axes left
# over from an earlier plot.
breakthrough_axes = plt.gca()
breakthrough_axes.spines['top'].set_visible(False)
breakthrough_axes.spines['right'].set_visible(False)
breakthrough_axes.spines['left'].set_visible(False)
breakthrough_axes.spines['bottom'].set_visible(True)

plt.show()